import pandas as pd
import matplotlib.pyplot as plt
import re

"""
This script counts how often each theme occurs per activity (cluster), plots the counts as a grouped bar chart and prints them.

@author: Jonathan van Oudheusden
@date: 2024-01-28

Required files: 
    - 'combinedCodes.xlsx'
    
Output files:
    - 'themesPerActivity.png'
"""

# Read the combined-codes spreadsheet into a DataFrame
file_path = 'combinedCodes.xlsx'
df = pd.read_excel(io=file_path)

# Activity clusters that are taken into account in the analysis
relevant_clusters = [
    "Self-efficacy",
    "Practical knowledge",
    "Awareness of positive outcomes",
    "Awareness of negative outcomes",
    "Mindset that physical activity helps to quit smoking",
]

# Mapping from every theme to the set of codes that belong to it.
# A few codes (e.g. "Keep positive outlook") are listed under more than one theme.
ThemesAndCodes = {
    "Motivational Deficits": {
        "Empty",
        "Activity wasn't important",
        "Busy",
        "Negative about activity",
    },
    "Behavioral Changes": {
        "Inspired to do physical activity",
        "Walks",
        "Smoking less",
        "Replacing with other habit",
        "Distracting from smoking",
        "Tracking smoking and activity",
    },
    "Emotional and Cognitive Changes": {
        "Inspired to do research",
        "Got motivated to quit",
        "Aware of obstacles",
        "Mindfulness",
        "Thinking of the steps",
        "Long term attitude",
    },
    "Embracing Positivity": {
        "Keep positive outlook",
        "Discouragement in negative approach",
        "Saving money",
        "Benefits of quitting",
        "Positive about activity",
    },
    "Negative Aspects Awareness": {
        "Consequences of smoking",
        "Health consequences",
        "Wasting money",
        "Negatives of quitting",
        "Struggling with stopping",
    },
    "Motivation and Mindset": {
        "Keep positive outlook",
        "Benefits of quitting",
        "Attempted self-motivation",
        "Got motivated to quit",
        "Motivation from others",
        "Remembering personal goals and motivation",
        "Envision future consequences and benefits",
    },
}

def splitPeerCodes(ownCodes):
    """Split a comma-separated string of codes into a list of trimmed codes.

    Whitespace around the commas and at both ends of every code is removed.
    Empty entries (for example from an empty string or a trailing comma) are
    kept as empty strings.
    """
    pieces = re.split(r'\s*,\s*', ownCodes)
    return list(map(str.strip, pieces))


# Count, for every theme, how many of its codes occur per relevant cluster
cluster_counts = {theme: {cluster: 0 for cluster in relevant_clusters} for theme in ThemesAndCodes}

# Normalise the codes of every theme once, instead of redoing it for every
# code of every row; sets also give constant-time membership tests
normalized_theme_codes = {
    theme: {theme_code.lower().strip() for theme_code in theme_codes}
    for theme, theme_codes in ThemesAndCodes.items()
}

# Pull the needed columns out once so consecutive rows can be paired with zip
cluster_column = df['cluster_new_index'].tolist()
rand_id_column = df['rand_id'].tolist()
codes_column = df['My codes'].tolist()

# Pair every row with the row directly below it: the cluster is taken from the
# upper row and the codes from the lower row.
# NOTE(review): presumably the spreadsheet stores the coded answer directly
# below the row that names the activity - confirm against the data.
for cluster, rand_id, next_rand_id, my_codes in zip(
        cluster_column, rand_id_column, rand_id_column[1:], codes_column[1:]):
    # Only count when the upper row's cluster is relevant and both rows share the same rand_id
    if cluster not in relevant_clusters or rand_id != next_rand_id:
        continue

    # Skip missing cells (NaN/None) and any other non-text value, which
    # cannot be lower-cased or split into codes
    if not isinstance(my_codes, str):
        continue

    for code in splitPeerCodes(my_codes.lower()):
        # A code that is listed under several themes counts once for each of them
        for theme, theme_codes in normalized_theme_codes.items():
            if code in theme_codes:
                cluster_counts[theme][cluster] += 1

                        

# Draw a grouped bar chart: one group per cluster, one bar per theme
fig, ax = plt.subplots(figsize=(15, 15), constrained_layout=True)
bar_width = 0.1
group_positions = range(len(relevant_clusters))
for i, theme in enumerate(ThemesAndCodes):
    counts = [cluster_counts[theme][cluster] for cluster in relevant_clusters]
    # Shift every theme by one bar width so the bars of a group sit side by side
    ax.bar([x + i * bar_width for x in group_positions], counts, width=bar_width, label=theme)

# Bars are centred on their x value, so the bar centres of a group run from
# x to x + (number of themes - 1) * bar_width; place the tick in the middle
# of that range instead of at a fixed offset
tick_offset = (len(ThemesAndCodes) - 1) * bar_width / 2
ax.set_xticks([x + tick_offset for x in group_positions])
ax.set_xticklabels(relevant_clusters, rotation=45, ha="right")

ax.legend(title="Themes")
ax.set_ylabel("Number of theme-specific codes Appearing per Activity")
ax.set_title("Themes for Each Activity")

# Write the chart to the output file
fig.savefig('themesPerActivity.png')


# Print the counts as text: one header per theme, followed by one line per cluster
for theme in cluster_counts:
    print(f"\nTheme: {theme}")
    counts_for_theme = cluster_counts[theme]
    for cluster in counts_for_theme:
        count = counts_for_theme[cluster]
        print(f"  Cluster '{cluster}': {count} occurrences")
